# Numeric / tabular libraries plus the plotting stack used by the whole script.
import numpy as np, pandas as pd
from matplotlib import pylab
import matplotlib.pyplot as plt
import matplotlib as mpl
# Reset rcParams first so the seaborn style set below starts from a clean slate.
mpl.rcParams.update(mpl.rcParamsDefault) # restore matplotlib's default style
import seaborn as sns
sns.set_style('whitegrid')
# plt.style.use('ggplot')
# https://tonysyu.github.io/raw_content/matplotlib-style-gallery/gallery.html
import warnings
# Silences every warning category for the remainder of the script.
warnings.filterwarnings('ignore')
# Shared x grid and a fixed seed so the noisy curves are reproducible.
x = np.linspace(0, 10, 50)
np.random.seed(10)

# Stateful (pyplot) interface: every plt.* call targets the implicit current axes.
# Each entry is (slope, noise generator). randn draws from the standard normal
# distribution, rand from the uniform distribution on [0, 1).
# NOTE(review): the third curve uses uniform noise while the others use normal
# noise -- confirm this is intended.
for slope, noise in ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand)):
    plt.plot(x, np.sin(x) + slope * x + noise(50))

plt.title("Three Curves: Stateful (plt.plot)")
plt.show()
# Stateless (object-oriented) interface: plt.subplots creates a figure together
# with its axes; when several subplots are requested the axes come back as an array.
fig, ax0 = plt.subplots(nrows=1)

# Same three noisy curves, now drawn through the explicit Axes object.
# randn: standard normal noise; rand: uniform noise on [0, 1).
for slope, noise in ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand)):
    ax0.plot(x, np.sin(x) + slope * x + noise(50))

ax0.set_title("Three Curves: Stateless (ax.plot)")
plt.show()
# Two stacked subplots built purely through the object-oriented interface.
fig = plt.figure()  # create a new, empty figure

# Figure layout 2x1: the first chart goes into the top slot.
ax0 = fig.add_subplot(211)
ax0.plot(x, np.sin(x) + x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax0.plot(x, np.sin(x) + 2 * x + np.random.rand(50))

# Second chart goes into the bottom slot.
ax1 = fig.add_subplot(212)
ax1.plot(x, np.sin(x) + x + np.random.randn(50))
ax1.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
ax1.plot(x, np.sin(x) + 2 * x + np.random.rand(50))

# Reference line on the bottom axes. Called on ax1 directly (instead of
# plt.axhline) so the example really is "stateless only" and does not depend
# on which axes happens to be current.
ax1.axhline(y=10, color='purple', linestyle='--')

fig.suptitle("Subplot: Stateless Only")
plt.show()
# Mixing both interfaces on one figure.
fig = plt.figure()
curve_specs = ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand))

# Top slot (two rows, one column): drawn with the stateful pyplot calls.
plt.subplot(211)
for slope, noise in curve_specs:
    plt.plot(x, np.sin(x) + slope * x + noise(50))

# Bottom slot: drawn through an explicit Axes object.
ax0 = fig.add_subplot(212)
for slope, noise in curve_specs:
    ax0.plot(x, np.sin(x) + slope * x + noise(50))

# Horizontal reference line on the current axes; other values such as the
# mean or the median could be used here instead of the constant 10.
plt.axhline(y=10, color='purple', linestyle='--')

fig.suptitle("Subplot: Stateful & Stateless")
plt.show()
# Axes customisation demo: title, grid style, axis labels and tick locations.
x = np.linspace(0, 10, 50)
np.random.seed(10)

fig, ax0 = plt.subplots(nrows=1)
for slope, noise in ((1, np.random.randn), (0.5, np.random.randn), (2, np.random.rand)):
    ax0.plot(x, np.sin(x) + slope * x + noise(50))
ax0.set_title("Three curves", fontsize=20)

# 1. Grid lines: dotted and semi-transparent (alpha controls the transparency).
#    To hide the grid altogether use ax0.grid(False).
ax0.grid(color='gray', alpha=0.5, linestyle='dotted')

# 2. Axis labels, both at font size 15.
ax0.set_xlabel('X', fontsize=15)
ax0.set_ylabel('Randomization', fontsize=15)

# 3. One x tick per unit across the data range.
ax0.set_xticks(np.arange(x.min(), x.max() + 1, 1))

# Optional: sns.despine(ax=ax0, left=True, bottom=True) removes the left and bottom frame.
plt.show()
# Plot every column of a DataFrame in its own vertically stacked subplot.
x = np.linspace(0, 10)  # np.linspace defaults to 50 points, matching randn(50) below
np.random.seed(10)

# Generate the data: three noisy trends with different slopes.
y1 = np.sin(x) + x + np.random.randn(50)
y2 = np.sin(x) + 0.5 * x + np.random.randn(50)
y3 = np.sin(x) + 2 * x + np.random.randn(50)
df = pd.DataFrame({'serie1': y1, 'serie2': y2, 'serie3': y3})

fig = plt.figure()
fig.subplots_adjust(hspace=0.4)  # extra vertical space between the subplots

# Subplot positions are 1-based, hence start=1.
for i, col in enumerate(df.columns, start=1):
    plt.subplot(df.shape[1], 1, i)
    plt.plot(df.loc[:, col])
    # Put the column name inside the plotting area, right-aligned.
    plt.title(col, y=0.6, loc='right')
plt.show()
# Load the UK bank customer data; the relative path is resolved against the
# current working directory. From here on `df` refers to this table.
df = pd.read_csv("UK-Bank-Customers.csv")
df.head()  # preview of the first rows (only rendered in a notebook / REPL)
| Customer ID | Name | Surname | Gender | Age | Region | Job Classification | Date Joined | Balance | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000001 | Simon | Walsh | Male | 21 | England | White Collar | 05.Jan.15 | 113810.15 |
| 1 | 400000002 | Jasmine | Miller | Female | 34 | Northern Ireland | Blue Collar | 06.Jan.15 | 36919.73 |
| 2 | 100000003 | Liam | Brown | Male | 46 | England | White Collar | 07.Jan.15 | 101536.83 |
| 3 | 300000004 | Trevor | Parr | Male | 32 | Wales | White Collar | 08.Jan.15 | 1421.52 |
| 4 | 100000005 | Deirdre | Pullman | Female | 38 | England | Blue Collar | 09.Jan.15 | 35639.79 |
# Replace the original headers with short snake_case names. The assignment is
# positional, so the list must follow the column order of the CSV.
df.columns = [
    'cust_id', 'first_name', 'last_name', 'gender', 'age',
    'region', 'job', 'date_join', 'balance',
]

# Bucket customers by age. pd.cut uses right-closed intervals by default:
# (15, 30], (30, 50] and (50, inf).
age_bins = [15, 30, 50, np.inf]
age_labels = ['15-30', '30-50', 'Above 50']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels)
df.head()
| cust_id | first_name | last_name | gender | age | region | job | date_join | balance | age_group | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100000001 | Simon | Walsh | Male | 21 | England | White Collar | 05.Jan.15 | 113810.15 | 15-30 |
| 1 | 400000002 | Jasmine | Miller | Female | 34 | Northern Ireland | Blue Collar | 06.Jan.15 | 36919.73 | 30-50 |
| 2 | 100000003 | Liam | Brown | Male | 46 | England | White Collar | 07.Jan.15 | 101536.83 | 30-50 |
| 3 | 300000004 | Trevor | Parr | Male | 32 | Wales | White Collar | 08.Jan.15 | 1421.52 | 30-50 |
| 4 | 100000005 | Deirdre | Pullman | Female | 38 | England | Blue Collar | 09.Jan.15 | 35639.79 | 30-50 |
# Average balance per region, kept as a flat frame (region stays a column),
# then rounded to the nearest whole number.
dt_region_mean_bal = df.groupby('region', as_index=False)['balance'].mean()
dt_region_mean_bal = dt_region_mean_bal.assign(
    balance=np.rint(dt_region_mean_bal['balance'])
)
dt_region_mean_bal
| region | balance | |
|---|---|---|
| 0 | England | 39293.0 |
| 1 | Northern Ireland | 39505.0 |
| 2 | Scotland | 39511.0 |
| 3 | Wales | 42390.0 |
def autolabel(rects, ax=None):
    """Attach a text label above each bar displaying its height.

    Parameters
    ----------
    rects : iterable of bar patches, e.g. the container returned by ``Axes.bar``.
    ax : optional Axes to write the labels on. When omitted, each label is
        written on the axes its bar belongs to.
    """
    for rect in rects:
        height = rect.get_height()
        target = rect.axes if ax is None else ax
        target.text(
            rect.get_x() + rect.get_width() / 2.,  # horizontal centre of the bar
            1.05 * height,                         # slightly above the bar top
            f'{int(height)}',
            ha='center', va='bottom',              # ha = horizontal alignment
        )


# Bar chart of the average balance per region, with a data label on each bar.
fig, ax0 = plt.subplots(nrows=1)

# Draw the bars once and keep the returned container for labelling.
rect1 = ax0.bar(dt_region_mean_bal['region'], dt_region_mean_bal['balance'], color='blue')

# Title and axis labels.
ax0.set_title("Balance by region", fontsize=20)
ax0.set_xlabel('Region')
ax0.set_ylabel('Average balance')
ax0.xaxis.label.set_size(20)  # x-axis label size
ax0.yaxis.label.set_size(20)  # y-axis label size

# Leave headroom above the bars for the data labels; hide the grid.
ax0.set_ylim(top=70000)
ax0.grid(False)

autolabel(rect1)
plt.show()
# Balance vs (region + gender): input for a bar chart with two grouping keys.
# Mean balance per (region, gender) pair as a flat frame, rounded to whole numbers.
dt_rg_mean = df.groupby(['region', 'gender'], as_index=False)['balance'].mean()
dt_rg_mean = dt_rg_mean.assign(balance=np.rint(dt_rg_mean['balance']))
dt_rg_mean
| region | gender | balance | |
|---|---|---|---|
| 0 | England | Female | 39989.0 |
| 1 | England | Male | 38582.0 |
| 2 | Northern Ireland | Female | 38769.0 |
| 3 | Northern Ireland | Male | 41644.0 |
| 4 | Scotland | Female | 37306.0 |
| 5 | Scotland | Male | 40385.0 |
| 6 | Wales | Female | 40312.0 |
| 7 | Wales | Male | 44852.0 |
# Same means in wide form: unstack() moves the inner index level (gender) into
# the columns, leaving one row per region.
df.groupby(['region','gender'])['balance'].mean().unstack()
| gender | Female | Male |
|---|---|---|
| region | ||
| England | 39988.979505 | 38581.854270 |
| Northern Ireland | 38769.423567 | 41643.831296 |
| Scotland | 37306.419060 | 40385.072099 |
| Wales | 40312.093085 | 44852.180714 |
# Object-oriented matplotlib combined with pandas plotting: pandas draws onto
# the axes we hand it, and the axes is then customised directly.
fig, ax1 = plt.subplots(nrows=1)

# Wide frame (one column per gender) -> one bar per gender within each region.
mean_balance_wide = df.groupby(['region', 'gender'])['balance'].mean().unstack()
mean_balance_wide.plot.bar(ax=ax1)

ax1.set_ylim(top=70000)
ax1.set(title="Balance by region and gender", ylabel='Average Balance')
ax1.grid(False)
plt.show()
# Without unstack(): the Series keeps its (region, gender) MultiIndex, so every
# pair becomes its own bar on a single category axis.
fig, ax1 = plt.subplots(nrows=1)

mean_balance_long = df.groupby(['region', 'gender'])['balance'].mean()
mean_balance_long.plot.bar(ax=ax1)

ax1.set_ylim(top=70000)
ax1.set(title="Balance by region and gender", ylabel='Average Balance')
ax1.grid(False)
plt.show()
# Two y-axes: useful e.g. for plotting conversions and conversion rate on the same chart.
# Dummy data: 1000 daily observations starting on 2000-01-01.
ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range('1/1/2000', periods=1000),
)

# Four series of standard-normal steps sharing the same date index; the
# cumulative sum turns the noise into random walks that look like trends.
df_ts = pd.DataFrame(
    np.random.randn(1000, 4),
    index=ts.index,
    columns=['A', 'B', 'C', 'D'],
).cumsum()
df_ts.head()
| A | B | C | D | |
|---|---|---|---|---|
| 2000-01-01 | 0.739637 | 2.389500 | -0.990517 | 0.799171 |
| 2000-01-02 | -0.443838 | 1.018943 | -1.359769 | -0.260771 |
| 2000-01-03 | 0.563243 | 1.176798 | 0.260361 | 0.940663 |
| 2000-01-04 | -0.533139 | -0.308924 | -0.282023 | 1.899379 |
| 2000-01-05 | -0.521157 | 0.688589 | 0.153674 | 2.948773 |
# Dual y-axis, variant 1: let pandas create the secondary axis. Columns C and D
# go on the right-hand axis and are flagged as such in the legend (mark_right).
df_ts.plot(secondary_y=['C', 'D'], mark_right=True)
plt.show()

# Dual y-axis, variant 2: create the twin axes by hand.
fig, ax0 = plt.subplots()
df_ts.plot.line(use_index=True, y=['A', 'B'], ax=ax0)

# twinx() returns a second axes sharing the x-axis with ax0. Colours and legend
# position can then be configured separately for each axes.
ax1 = ax0.twinx()
df_ts.plot.line(use_index=True, y=['C', 'D'], ax=ax1)
plt.show()
# Load the iris measurements; the relative path is resolved against the
# current working directory.
df_iris = pd.read_csv('iris.csv') # returns a DataFrame
df_iris.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
import seaborn as sns
sns.set_style('whitegrid')

# Stateful usage: seaborn draws on the current axes and pyplot calls tweak it.
sns.boxplot(x='species', y='sepal_length', data=df_iris)
plt.xticks(rotation=-45)  # rotate the x tick labels
plt.title('Iris species sepal_length boxplot')  # add title
plt.show()
# Box plot of UK bank client balances per age group, combining seaborn with
# matplotlib through the stateless (object-oriented) interface.
fig, ax2 = plt.subplots()

# Passing ax= tells seaborn which matplotlib axes to draw on.
sns.boxplot(data=df, x='age_group', y='balance', ax=ax2)

ax2.grid(False)
ax2.set(title="Balance boxplot sliced by age_group")
plt.show()
# Balance distribution for the different age groups.
# Stateless (OOP): all density curves are drawn on the same explicit axes.
fig = plt.figure()
ax3 = fig.add_subplot(1, 1, 1)

# sns.distplot is deprecated; kdeplot is its replacement for the density-only
# (hist=False) case. One curve per age group, labelled for the legend.
for group in ['15-30', '30-50', 'Above 50']:
    sns.kdeplot(df.loc[df['age_group'] == group, 'balance'], label=group, ax=ax3)

ax3.grid(False)
ax3.set_title("Balance distribution by age_group")
ax3.legend()
plt.show()
# barplot returns the matplotlib Axes it drew on, which can be customised further.
from matplotlib.ticker import PercentFormatter

df_titanic = sns.load_dataset('titanic')
g = sns.barplot(x="sex", y="survived", hue="class", ci=None, data=df_titanic)
g.set_ylabel('survival rate')

# Show the y ticks as percentages. A formatter keeps labels and tick positions
# in sync, unlike set_yticklabels with a hand-built list of strings, which
# goes stale as soon as the tick locations change.
g.yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=2))
plt.show()
# Grid of pairwise plots of the iris features, coloured by species.
sns.pairplot(data=df_iris, hue="species") # visual check of feature relationships
plt.show()
df_iris.head()
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Two stacked seaborn plots on one tall figure; figsize is (width, height) in inches.
fig = plt.figure(figsize=(8, 16))

# Top: box plot per species (equivalent to ax0 = fig.add_subplot(2, 1, 1)).
plt.subplot(2, 1, 1)
sns.boxplot(data=df_iris, x='species', y='sepal_length')
plt.xticks(rotation=-45)  # rotate the x tick labels
plt.title('Iris species sepal_length boxplot')  # add title

# Bottom: overall sepal_length distribution. sns.distplot is deprecated;
# histplot with a density-normalised histogram plus KDE reproduces it.
plt.subplot(2, 1, 2)
sns.histplot(df_iris['sepal_length'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.show()
# Load the exercise data and keep only columns 1 through 5 by position,
# dropping whatever sits in column 0.
df_factor = pd.read_csv('factor.csv').iloc[:, 1:6]
df_factor.head()
| id | diet | pulse | time | kind | |
|---|---|---|---|---|---|
| 0 | 1 | low fat | 85 | 1 min | rest |
| 1 | 1 | low fat | 85 | 15 min | rest |
| 2 | 1 | low fat | 88 | 30 min | rest |
| 3 | 2 | low fat | 90 | 1 min | rest |
| 4 | 2 | low fat | 92 | 15 min | rest |
# catplot is figure-level: g is the seaborn grid object it returns, not a
# matplotlib Axes. One box-plot panel per diet (col), coloured by diet (hue).
g = sns.catplot(
    data=df_factor,
    x='time',
    y='pulse',
    hue='diet',
    col='diet',
    kind='box',
)

# Rotate the x-axis tick labels on every panel.
g.set_xticklabels(rotation=-45)
plt.show()
# Synthetic linear data: y = 0.2 + 0.2 * x plus Gaussian noise (mean 0, std 3).
x = np.linspace(1, 50, 100)
epsilon = np.random.normal(loc=0, scale=3, size=100)
dt_lin = pd.DataFrame({'x': x})
dt_lin['y'] = 0.2 + 0.2 * dt_lin['x'] + epsilon
# Scatter plus fitted regression line on the current axes.
sns.set_style('whitegrid')
sns.regplot(data=dt_lin, x='x', y='y')
plt.show()

# Side-by-side comparison: default look (left) vs. grid removed and titled (right).
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(8, 4))
sns.regplot(data=dt_lin, x='x', y='y', ax=ax0)
sns.regplot(data=dt_lin, x='x', y='y', ax=ax1)
ax1.grid(False)
ax1.set_title('Clean linear regression')
plt.show()
# Noise-free quadratic data: y = 0.2 + 0.3 * x^2 on 100 points between 1 and 50.
x = np.linspace(1, 50, num=100)
dt_poly = pd.DataFrame({'x': x}).assign(
    y=lambda frame: 0.2 + 0.3 * np.power(frame['x'], 2)
)
# Second-order polynomial fit, no confidence band, small red scatter markers.
sns.regplot(
    data=dt_poly,
    x='x',
    y='y',
    order=2,
    ci=None,
    scatter_kws={"s": 20, 'color': 'r'},
)
plt.show()
# Load the phone usage log; the relative path is resolved against the current
# working directory.
df_network = pd.read_csv('phone_data.csv')
df_network.head()
| index | date | duration | item | month | network | network_type | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 15/10/14 06:58 | 34.429 | data | 2014-11 | data | data |
| 1 | 1 | 15/10/14 06:58 | 13.000 | call | 2014-11 | Vodafone | mobile |
| 2 | 2 | 15/10/14 14:46 | 23.000 | call | 2014-11 | Meteor | mobile |
| 3 | 3 | 15/10/14 14:48 | 4.000 | call | 2014-11 | Tesco | mobile |
| 4 | 4 | 15/10/14 17:27 | 4.000 | call | 2014-11 | Tesco | mobile |
# Parse the day/month/two-digit-year timestamps into datetimes.
df_network['date'] = pd.to_datetime(df_network['date'], format="%d/%m/%y %H:%M")
# NOTE: the timestamps keep their time of day, so the grouping below is per
# exact timestamp. For per-calendar-day figures, truncate first, e.g.
#   df_network['date'] = pd.to_datetime(df_network['date'].dt.date)

# Keep only 'data' and 'call' records and average the duration per
# (timestamp, network, item) combination.
df_network_agg = (
    df_network[df_network['item'].isin(['data', 'call'])]
    .groupby(['date', 'network', 'item'], as_index=False)['duration']
    .mean()
)
df_network_agg.head()
| date | network | item | duration | |
|---|---|---|---|---|
| 0 | 2014-10-15 06:58:00 | Vodafone | call | 13.000 |
| 1 | 2014-10-15 06:58:00 | data | data | 34.429 |
| 2 | 2014-10-15 14:46:00 | Meteor | call | 23.000 |
| 3 | 2014-10-15 14:48:00 | Tesco | call | 4.000 |
| 4 | 2014-10-15 17:27:00 | Tesco | call | 4.000 |
# Moving average of duration over a 7-row window per (network, item) group.
# The window counts rows, not calendar days, so it only equals "7 days" when
# there is exactly one row per day in each group.
def _trailing_mean(durations):
    """Mean over the current and up to six preceding rows, rounded to a whole number."""
    return durations.rolling(window=7, min_periods=1).mean().round(0)


# Chronological order first, so the rolling window runs forward in time.
df_network_agg = df_network_agg.sort_values(by=['date', 'network', 'item'])
df_network_agg_mavg_temp = (
    df_network_agg
    .groupby(['network', 'item'])['duration']
    .transform(_trailing_mean)
)
df_network_agg_mavg_temp.head()
0 13.0 1 34.0 2 23.0 3 4.0 4 4.0 Name: duration, dtype: float64
# Attach the rolling mean to the aggregated frame by index. Both sides carry a
# 'duration' column, so the right-hand one is suffixed to 'duration_7davg'.
df_network_agg_mavg_temp = df_network_agg.merge(
    df_network_agg_mavg_temp,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', '_7davg'),
)
# Replace any missing values with 0.
df_network_agg_mavg_temp = df_network_agg_mavg_temp.fillna(0)
df_network_agg_mavg_temp.head()
| date | network | item | duration | duration_7davg | |
|---|---|---|---|---|---|
| 0 | 2014-10-15 06:58:00 | Vodafone | call | 13.000 | 13.0 |
| 1 | 2014-10-15 06:58:00 | data | data | 34.429 | 34.0 |
| 2 | 2014-10-15 14:46:00 | Meteor | call | 23.000 | 23.0 |
| 3 | 2014-10-15 14:48:00 | Tesco | call | 4.000 | 4.0 |
| 4 | 2014-10-15 17:27:00 | Tesco | call | 4.000 | 4.0 |
# Line chart with seaborn, controlling hue, hue order and line style per network.
# Networks in order of first appearance. An earlier ordering by the
# 2014-11-01 rolling mean was immediately overwritten by this one and has
# been removed as dead code.
network_order = df_network_agg_mavg_temp['network'].unique().tolist()

# Explicit palette: shades of blue from dark to light.
# NOTE(review): seaborn expects one colour per hue level -- confirm the data
# never contains more networks than the seven colours listed here.
color_shaded_blue = ['#1967D2', '#1A73E8', '#4285F4', '#8AB4F8', '#AECBFA', '#D2E3FC', '#E8F0FE']

fig, ax0 = plt.subplots(1, 1, figsize=(14, 10))
sns.lineplot(
    x='date', y='duration_7davg',
    hue='network', hue_order=network_order,
    style='network', style_order=network_order,
    palette=color_shaded_blue,
    data=df_network_agg_mavg_temp,
    ax=ax0,
    ci=None,  # None is the documented way to switch the confidence band off
)

# Declutter: no grid, frameless legend outside the plot area, no axis labels.
ax0.grid(False)
ax0.legend(loc=2, bbox_to_anchor=(1, 1), prop={'size': 15}, frameon=False)
ax0.set_xlabel('')
ax0.set_ylabel('', fontsize=16)
ax0.set_title('Test title', fontdict={'fontsize': 18, 'fontweight': 'medium'})

# Uniform tick-label size on both axes.
for label in ax0.get_xticklabels() + ax0.get_yticklabels():
    label.set_fontsize(13)

sns.despine()  # remove top and right spines
ax0.spines['left'].set_visible(False)    # remove left spine
ax0.spines['bottom'].set_visible(False)  # remove bottom spine
plt.show()
import plotly
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.express as px
# Use the British bank data as an example: number of customers per (region, job).
dt_region_job_nums = df.groupby(['region', 'job'])['cust_id'].count().reset_index()

# Sankey node labels: regions first, then jobs, duplicates removed. Unlike
# list(set(...)), dict.fromkeys keeps first-appearance order, so the node
# numbering is the same on every run.
nodes = list(dict.fromkeys(
    dt_region_job_nums['region'].tolist() + dt_region_job_nums['job'].tolist()
))

# Label -> position lookup; avoids a linear nodes.index() scan for every row.
node_position = {label: position for position, label in enumerate(nodes)}
dt_region_job_nums['indice_region'] = dt_region_job_nums['region'].map(node_position)
dt_region_job_nums['indice_job'] = dt_region_job_nums['job'].map(node_position)
dt_region_job_nums.head()
| region | job | cust_id | indice_region | indice_job | |
|---|---|---|---|---|---|
| 0 | England | Blue Collar | 344 | 2 | 0 |
| 1 | England | Other | 314 | 2 | 1 |
| 2 | England | White Collar | 1501 | 2 | 4 |
| 3 | Northern Ireland | Blue Collar | 41 | 3 | 0 |
| 4 | Northern Ireland | Other | 105 | 3 | 1 |
import plotly.graph_objects as go

# Node appearance; labels come from the `nodes` list.
sankey_node = dict(
    pad=15,
    thickness=20,
    line=dict(color="black", width=0.5),
    label=nodes,
    color="blue",
)

# Flows: source and target are positions in `nodes`, value is the customer count.
sankey_link = dict(
    source=dt_region_job_nums['indice_region'],
    target=dt_region_job_nums['indice_job'],
    value=dt_region_job_nums['cust_id'],
)

fig = go.Figure(data=[go.Sankey(node=sankey_node, link=sankey_link)])
fig.update_layout(title_text="Customer region and job sankey chart", font_size=10)
fig.show()
# More details on Sankey charts: https://plotly.com/python/sankey-diagram/
# Plotly Express: bubble chart of life expectancy against GDP per capita.
import plotly.express as px

my_template = 'plotly_dark'

# NOTE: `df` is rebound to the gapminder data here; the bank customer frame is
# no longer reachable under that name after this point.
df = px.data.gapminder()
df_2007 = df.query("year==2007")

fig = px.scatter(
    df_2007,
    x="gdpPercap",
    y="lifeExp",
    size='pop',
    size_max=60,
    color="continent",
    hover_data=['country'],  # any of df_2007.columns can be listed here
    log_x=True,
    title=f"""Life exp in '{my_template}'""",
)

# Axis titles at size 15, centred white figure title, dark template.
fig.update_layout(
    xaxis=dict(title='gdpPercap', title_font=dict(size=15)),
    yaxis=dict(title='lifeExp', title_font=dict(size=15)),
    title_font_color='white',
    title_x=0.5,
    template=my_template,
)
fig.show()
# Sunburst of the 2007 population: continents on the inner ring, countries on
# the outer ring, coloured by life expectancy.
df = px.data.gapminder().query("year == 2007")
fig = px.sunburst(
    df,
    path=['continent', 'country'],
    values='pop',
    color='lifeExp',
    hover_data=['iso_alpha'],
)
fig.update_layout(template=my_template)
fig.show()
import altair as alt
from vega_datasets import data

# Interactive scatter plot of the cars sample dataset: horsepower against fuel
# economy, coloured by country of origin.
cars = data.cars()
encodings = dict(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
)
alt.Chart(cars).mark_point().encode(**encodings).interactive()